This is a work in progress.
In [90]:
import Bio
from Bio import AlignIO
from Bio import Phylo
import inspect
import os
from itertools import imap, groupby, starmap
#specify dir where alignment files are located
aligned_dir = "Biopython Alignment"
In [4]:
#optional step
alignment = AlignIO.read(os.path.join(aligned_dir,'all_seq.aln'),'clustal')
print alignment
In [8]:
#http://biopython.org/wiki/Phylo_cookbook
#http://biopython.org/wiki/Phylo
tree = Phylo.read(os.path.join(aligned_dir,'all_seq.dnd'),"newick")
Phylo.draw_ascii(tree)
print tree
In [136]:
def lookup_by_names(tree):
names = {}
for clade in tree.find_clades():
if clade.name:
if clade.name in names:
raise ValueError("Duplicate key: %s" % clade.name)
names[clade.name] = clade
return names
names = lookup_by_names(tree)
for phylum in ('h2009','s2009'):
print "Phylum size", len(names[phylum].get_terminals())
def tabulate_names(tree):
names = {}
for idx, clade in enumerate(tree.find_clades()):
if clade.name:
clade.name = '%s_i%d' % (clade.name,idx)
else:
clade.name = str(idx)
names[clade.name] = clade
return names
tree2 =copy.deepcopy(tree)
tabulate_names(tree2)
Phylo.draw_ascii(tree2)
In [ ]:
'''
Calculate distances between neighboring terminals
Suggested by Joel Berendzen
'''
import itertools
def terminal_neighbor_dists(self):
"""Return a list of distances between adjacent terminals."""
def generate_pairs(self):
pairs = itertools.tee(self)
pairs[1].next()
return itertools.izip(pairs[0], pairs[1])
return [self.distance(*i) for i in
generate_pairs(self.find_clades(terminal=True))]
depths method:
Create a mapping of tree clades to depths. The result is a dictionary where the keys are all of the Clade instances in the tree, and the values are the distance from the root to each clade (including terminals). By default the distance is the cumulative branch length leading to the clade, but with the unit_branch_lengths=True option, only the number of branches (levels in the tree) is counted. (http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc161)
In [9]:
tree_depths = tree.depths()
tree_depths
Out[9]:
In [65]:
name_depths_dict = {i.name:tree_depths[i] for i in tree_depths if i.name}
name_depths_dict
Out[65]:
In [103]:
#make list of two lists where the first list is the human stains and the second is the swine strains
def group_keys_by_host(keys):
strains = sorted(keys)
strains_by_host = [[i for i in group]for key, group in groupby(strains, lambda x: x[0])]
return strains_by_host
strains_by_host = group_keys_by_host(name_depths_dict.keys())
strains_by_host
Out[103]:
In [77]:
#make list of tuples where the first item is the human stain for one year and the second is the swine strain for same year
strains_by_year = [i for i in zip(strains_by_host[0],strains_by_host[1]) if i[0][1:] == i[1][1:]]
strains_by_year
Out[77]:
In [128]:
def compare(key_seq, vals_dict):
ks = copy.deepcopy(key_seq)
for i in key_seq:
#print i,": "
for ii in ks:
#print '\t', ii,
ks.remove(i)
#print
#print key_seq
#starmap(strains_by_host[0])
strains_by_host = group_keys_by_host(name_depths_dict.keys())
compare(strains_by_host[0], name_depths_dict)
In [146]:
def compare_genomes(genome_pair):
gen1, gen2 = genome_pair[0],genome_pair[1]
key = gen1+"-"+gen2
return key, name_depths_dict[gen1] - name_depths_dict[gen2]
In [151]:
import itertools
strains_by_host = group_keys_by_host(name_depths_dict.keys())
year_diffs={}
for i in strains_by_host:
matched = list(itertools.combinations(i,2))
for ii in matched:
key, val = compare_genomes(ii)
year_diffs[key] = val
print key,": ", year_diffs[key]
h, s = group_keys_by_host(year_diffs.keys())
In [149]:
year_diffs = {}
for i in strains_by_year:
key, val = compare_genomes(i)
year_diffs[key] = val
print key,": ", year_diffs[key]
year_diffs
Out[149]:
In [ ]: